This study aims to uncover gene expression differences in macrophages infected with various strains of Acinetobacter baumannii using RNA sequencing analysis.
A variety of R packages was used for this analysis. All graphics and data wrangling were handled using the tidyverse suite of packages. All packages used are available from the Comprehensive R Archive Network (CRAN), Bioconductor.org, or Github.
Raw reads were mapped to the human reference transcriptome by the Functional Genomics Core at the University of South Carolina who also carried out a quality assessment of the raw reads.
After receiving mapped and aligned data from the Functional Genomics Core, the data was loaded into R and columns including gene expression data for THP1mock and THP1 macrophages infected with 19606 were extracted.
# This step involves reading the csv file into R
library(tidyverse)
library(edgeR)
library(matrixStats)
library(cowplot)
library(dplyr)
# Load the tpm and cpm data
tpm_data <- read.csv("tpm_Human.csv", header = FALSE)
cpm_data <- read.csv("cpm_Human.csv", header = FALSE)
# Step 2: Data wrangling----
# Set column names explicitly for tpm and cpm
colnames(tpm_data) <- c("gene_id", "THPMock_Rep1", "THPMock_Rep2", "THPMock_Rep3",
"THP19606_Rep1", "THP19606_Rep2", "THP19606_Rep3",
"THPNFAB1_Rep1", "THPNFAB1_Rep2", "THPNFAB1_Rep3",
"THPNFAB2_Rep1", "THPNFAB2_Rep2", "THPNFAB2_Rep3")
colnames(cpm_data) <- c("gene_id", "THPMock_Rep1", "THPMock_Rep2", "THPMock_Rep3",
"THP19606_Rep1", "THP19606_Rep2", "THP19606_Rep3",
"THPNFAB1_Rep1", "THPNFAB1_Rep2", "THPNFAB1_Rep3",
"THPNFAB2_Rep1", "THPNFAB2_Rep2", "THPNFAB2_Rep3")
# Remove the first row if it contains headers and convert expression columns to numeric
tpm_data <- tpm_data[-1, ]
cpm_data <- cpm_data[-1, ]
tpm_data[,-1] <- lapply(tpm_data[,-1], as.numeric)
cpm_data[,-1] <- lapply(cpm_data[,-1], as.numeric)
# Extract columns for THP1 mock and 19606
tpm_mock_19606 <- tpm_data %>%
select(gene_id, starts_with("THPMock_"), starts_with("THP19606_"))
cpm_mock_19606 <- cpm_data %>%
select(gene_id, starts_with("THPMock_"), starts_with("THP19606_"))
# Convert the expression columns to numeric
tpm_mock_19606[,-1] <- lapply(tpm_mock_19606[,-1], as.numeric)
# Log2 transform the TPM data
tpm_mock_19606[,-1] <- log2(tpm_mock_19606[,-1] + 1)
# Pivot longer for ggplot2
tpm_data_long_mock_19606 <- tpm_mock_19606 %>%
pivot_longer(cols = -gene_id,
names_to = "samples",
values_to = "log_expression")
cpm_data_long_mock_19606 <- cpm_mock_19606 %>%
pivot_longer(cols = starts_with("THP"),
names_to = "samples",
values_to = "expression")
# Convert the expression column to numeric (already done above)
tpm_data_long_mock_19606$log_expression <- as.numeric(tpm_data_long_mock_19606$log_expression)
cpm_data_long_mock_19606$expression <- as.numeric(cpm_data_long_mock_19606$expression)library(tidyverse)
library(edgeR)
library(matrixStats)
library(cowplot)
# Create DGEList object with gene_id as rownames
dge <- DGEList(counts = as.matrix(cpm_mock_19606[,-1]))
rownames(dge) <- cpm_mock_19606$gene_id
# Unfiltered, non-normalized CPM violin plot
log2.cpm <- cpm(dge, log=TRUE)
log2.cpm.df <- as_tibble(log2.cpm, rownames = "geneID")
colnames(log2.cpm.df) <- c("geneID", colnames(cpm_mock_19606)[-1])
log2.cpm.df.pivot <- pivot_longer(log2.cpm.df, cols = starts_with("THP"), names_to = "samples", values_to = "expression")
p1 <- ggplot(log2.cpm.df.pivot) +
aes(x = samples, y = expression, fill = samples) +
geom_violin(trim = FALSE, show.legend = FALSE) +
stat_summary(fun = "median", geom = "point", shape = 95, size = 10, color = "black", show.legend = FALSE) +
labs(y = "Log2 CPM", x = "Sample", title = "Log2 CPM Distribution", subtitle = "Unfiltered, non-normalized", caption = paste0("Produced on ", Sys.time())) +
theme_bw()
# Filter genes with low counts
keepers <- rowSums(cpm(dge) > 1) >= 5
dge_filtered <- dge[keepers,]
# Filtered, non-normalized CPM violin plot
log2.cpm.filtered <- cpm(dge_filtered, log=TRUE)
log2.cpm.filtered.df <- as_tibble(log2.cpm.filtered, rownames = "geneID")
colnames(log2.cpm.filtered.df) <- c("geneID", colnames(cpm_data)[-1])
log2.cpm.filtered.df.pivot <- pivot_longer(log2.cpm.filtered.df, cols = starts_with("THP"), names_to = "samples", values_to = "expression")
p2 <- ggplot(log2.cpm.filtered.df.pivot) +
aes(x = samples, y = expression, fill = samples) +
geom_violin(trim = FALSE, show.legend = FALSE) +
stat_summary(fun = "median", geom = "point", shape = 95, size = 10, color = "black", show.legend = FALSE) +
labs(y = "Log2 CPM", x = "Sample", title = "Log2 CPM Distribution", subtitle = "Filtered, non-normalized", caption = paste0("Produced on ", Sys.time())) +
theme_bw()
# Filtered, TMM-normalized CPM violin plot
# Normalize using TMM
dge_filtered_norm <- calcNormFactors(dge_filtered, method = "TMM")
# Log2 transform the filtered and normalized CPM data
log2.cpm.filtered.norm <- cpm(dge_filtered_norm, log=TRUE)
log2.cpm.filtered.norm.df <- as_tibble(log2.cpm.filtered.norm, rownames = "geneID")
colnames(log2.cpm.filtered.norm.df) <- c("geneID", colnames(cpm_data)[-1])
log2.cpm.filtered.norm.df.pivot <- pivot_longer(log2.cpm.filtered.norm.df, cols = starts_with("THP"), names_to = "samples", values_to = "expression")
p3 <- ggplot(log2.cpm.filtered.norm.df.pivot) +
aes(x = samples, y = expression, fill = samples) +
geom_violin(trim = FALSE, show.legend = FALSE) +
stat_summary(fun = "median", geom = "point", shape = 95, size = 10, color = "black", show.legend = FALSE) +
labs(y = "Log2 CPM", x = "Sample", title = "Log2 CPM Distribution", subtitle = "Filtered, TMM normalized", caption = paste0("Produced on ", Sys.time())) +
theme_bw()
# Combine plots into a grid
p_combined <- plot_grid(p1, p2, p3, labels = c('A', 'B', 'C'), label_size = 12)
print(p_combined)Filtering was carried out to remove lowly expressed genes. Genes with less than 1 count per million (CPM) in at least 5 or more samples filtered out. This reduced the number of genes from 53837 to 14969.
library(tidyverse)
library(DT)
library(gt)
library(plotly)
# Compute average expression and log fold change
log2.cpm.filtered.norm.df <- log2.cpm.filtered.norm.df %>%
mutate(
mock_AVG = rowMeans(select(., starts_with("THPMock_"))),
infected_AVG = rowMeans(select(., starts_with("THP19606_"))),
LogFC = infected_AVG - mock_AVG
) %>%
mutate_if(is.numeric, round, 2)
# Display table with datatable
datatable(log2.cpm.filtered.norm.df[,c("geneID", "mock_AVG", "infected_AVG", "LogFC")],
extensions = c('KeyTable', "FixedHeader"),
filter = 'top',
options = list(keys = TRUE,
searchHighlight = TRUE,
pageLength = 10,
lengthMenu = c("10", "25", "50", "100")))The table shown above includes expression data for 14969 genes. You can sort and search the data directly from the table.
# Assuming that the sample labels are in the same order as in the data
# Define treatment, genotype, and sample name vectors
sampleLabels <- c("THPMock_Rep1", "THPMock_Rep2", "THPMock_Rep3",
"THP19606_Rep1", "THP19606_Rep2", "THP19606_Rep3")
treatment <- c("Mock", "Mock", "Mock", "19606", "19606", "19606")
genotype <- c("THP", "THP", "THP", "THP", "THP", "THP")
# Define the treatment groups
group <- factor(c(rep("THPMock", 3), rep("THP19606", 3)))
# Create the design matrix
design <- model.matrix(~0 + group)
colnames(design) <- levels(group)
# Perform PCA
pca.res <- prcomp(t(log2.cpm.filtered.norm[, sampleLabels]), scale. = FALSE, retx = TRUE)
# Calculate the percentage of variance explained by each principal component
pc.var <- pca.res$sdev^2
pc.per <- round(pc.var / sum(pc.var) * 100, 1)
# Create a data frame for PCA results
pca.res.df <- as_tibble(pca.res$x)
pca.res.df <- mutate(pca.res.df,
sampleName = sampleLabels,
treatment = treatment,
genotype = genotype)
# Create PCA plot
pca.plot <- ggplot(pca.res.df, aes(x = PC1, y = PC2, label = sampleName, color = treatment)) +
geom_point(size = 4) +
stat_ellipse(aes(group = group)) +
xlab(paste0("PC1 (", pc.per[1], "%)")) +
ylab(paste0("PC2 (", pc.per[2], "%)")) +
labs(title = "PCA plot",
caption = paste0("Produced on ", Sys.time())) +
coord_fixed() +
theme_bw()
# Convert ggplot to plotly for interactivity
pca.plotly <- ggplotly(pca.plot) %>%
layout(legend = list(title = list(text = 'Treatment')))
pca.plotlylibrary(tidyverse)
library(limma)
library(edgeR)
library(gt)
library(DT)
library(plotly)
# Step 7: Differential Expression Analysis----
# Apply voom transformation
v.DEGList.filtered.norm <- voom(dge_filtered_norm, design, plot = FALSE)
# Fit the linear model
fit <- lmFit(v.DEGList.filtered.norm, design)
# Create contrast matrix
contrast.matrix <- makeContrasts(infection = THP19606 - THPMock, levels=design)
# Fit the contrasts
fits <- contrasts.fit(fit, contrast.matrix)
ebFit <- eBayes(fits)
# Get the top table with adjusted p-values
myTopHits <- topTable(ebFit, adjust = "BH", coef = 1, number = 40000, sort.by = "logFC")
# Convert to tibble for easier handling
myTopHits.df <- as_tibble(rownames_to_column(myTopHits, var = "geneID"))
# Step 8: Volcano plot----
vplot <- ggplot(myTopHits.df) +
aes(y = -log10(adj.P.Val), x = logFC, text = paste("Symbol:", geneID)) +
geom_point(size = 2) +
geom_hline(yintercept = -log10(0.01), linetype = "longdash", colour = "grey", linewidth = 1) +
geom_vline(xintercept = 1, linetype = "longdash", colour = "#BE684D", linewidth = 1) +
geom_vline(xintercept = -1, linetype = "longdash", colour = "#2C467A", linewidth = 1) +
#annotate("rect", xmin = 1, xmax = 12, ymin = -log10(0.01), ymax = 7.5, alpha=.2, fill="#BE684D") +
#annotate("rect", xmin = -1, xmax = -12, ymin = -log10(0.01), ymax = 7.5, alpha=.2, fill="#2C467A") +
labs(title = "Volcano Plot",
subtitle = "19606 vs THP1mock",
caption = paste0("Produced on ", Sys.time())) +
theme_bw()
# Convert to plotly for interactivity
vplotly <- ggplotly(vplot)
vplotlyTo identify differentially expressed genes, precision weights were first applied to each gene based on its mean-variance relationship using VOOM, then data was normalized using the TMM method in EdgeR. Linear modeling and bayesian stats were employed via Limma to find genes that were up- or down-regulated in THP1 macrophages infected with 19606 by 4-fold or more, with a false-discovery rate (FDR) of 0.01.
results <- decideTests(ebFit, method="global", adjust.method="BH", p.value=0.01, lfc=2)
colnames(v.DEGList.filtered.norm$E) <- sampleLabels
diffGenes <- v.DEGList.filtered.norm$E[results[,1] !=0,]
diffGenes.df <- as_tibble(diffGenes, rownames = "geneID")
datatable(diffGenes.df,
extensions = c('KeyTable', "FixedHeader"),
caption = 'Table 1: DEGs in THP1 Mock vs 19606',
options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 10, lengthMenu = c("10", "25", "50", "100"))) %>%
formatRound(columns=c(2:11), digits=2)Pearson correlation was used to cluster 542 differentially expressed genes, which were then represented as heatmap with the data scaled by Zscore for each row.
library(tidyverse)
library(gplots)
library(RColorBrewer)
# Define heatmap colors
myheatcolors <- rev(brewer.pal(name="RdBu", n=11))
# Ensure diffGenes matrix is correctly formatted with genes as rows and samples as columns
# Cluster rows by Pearson correlation and columns by Spearman correlation
clustRows <- hclust(as.dist(1 - cor(t(diffGenes), method="pearson")), method="complete")
clustColumns <- hclust(as.dist(1 - cor(diffGenes, method="spearman")), method="complete")
# Assign modules using hierarchical clustering
module.assign <- cutree(clustRows, k=2) # k is the number of clusters
module.color <- rainbow(length(unique(module.assign)), start=0.1, end=0.9)
module.color <- module.color[as.vector(module.assign)]
# Heatmap for all differentially expressed genes
heatmap.2(as.matrix(diffGenes),
Rowv=as.dendrogram(clustRows),
Colv=as.dendrogram(clustColumns),
RowSideColors=module.color,
col=myheatcolors, scale='row', labRow=NA,
density.info="none", trace="none",
cexRow=1, cexCol=1, margins=c(8,20))# Extract and plot upregulated module
modulePick <- 1 # Pick the module number
myModule_up <- diffGenes[names(module.assign[module.assign %in% modulePick]),]
hrsub_up <- hclust(as.dist(1 - cor(t(myModule_up), method="pearson")), method="complete")
heatmap.2(as.matrix(myModule_up),
Rowv=as.dendrogram(hrsub_up),
Colv=NA,
labRow=NA,
col=myheatcolors, scale="row",
density.info="none", trace="none",
RowSideColors=module.color[module.assign %in% modulePick], margins=c(8,20))modulePick <- 2
myModule_down <- diffGenes[names(module.assign[module.assign %in% modulePick]),]
hrsub_down <- hclust(as.dist(1 - cor(t(myModule_down), method="pearson")), method="complete")
heatmap.2(myModule_down,
Rowv=as.dendrogram(hrsub_down),
Colv=NA,
labRow=NA,
col=myheatcolors, scale="row",
density.info="none", trace="none",
RowSideColors=module.color[module.assign %in% modulePick],
dendrogram='row',
margins=c(8,20))GO enrichment for the 14969 genes induced by infection
library(tidyverse)
library(limma)
library(gplots) #for heatmaps
library(DT) #interactive and searchable tables of our GSEA results
library(GSEABase) #functions and methods for Gene Set Enrichment Analysis
library(Biobase) #base functions for bioconductor; required by GSEABase
library(GSVA) #Gene Set Variation Analysis, a non-parametric and unsupervised method for estimating variation of gene set enrichment across samples.
library(gprofiler2) #tools for accessing the GO enrichment results using g:Profiler web resources
library(clusterProfiler) # provides a suite of tools for functional enrichment analysis
library(msigdbr) # access to msigdb collections directly within R
library(enrichplot) # great for making the standard GSEA enrichment plots
# Perform GO enrichment analysis for upregulated module
gost.res_up <- gost(rownames(myModule_up), organism = "hsapiens", correction_method = "fdr")
gostplot(gost.res_up, interactive = TRUE, capped = TRUE) # set interactive=FALSE to get plot for publications# Perform GO enrichment analysis for downregulated module
gost.res_down <- gost(rownames(myModule_down), organism = "hsapiens", correction_method = "fdr")
gostplot(gost.res_down, interactive = TRUE, capped = TRUE) # set interactive=FALSE to get plot for publications# Prepare the GSEA Input
mydata.df <- myTopHits.df
mydata.df.sub <- dplyr::select(mydata.df, geneID, logFC)
mydata.gsea <- mydata.df.sub$logFC
names(mydata.gsea) <- as.character(mydata.df.sub$geneID)
mydata.gsea <- sort(mydata.gsea, decreasing = TRUE)
# Retrieve Gene Sets for Enrichment Analysis
hs_gsea_c2 <- msigdbr(species = "Homo sapiens", category = "C2") %>%
dplyr::select(gs_name, gene_symbol)
# run GSEA using the 'GSEA' function from clusterProfiler
myGSEA.res <- GSEA(mydata.gsea, TERM2GENE = hs_gsea_c2, verbose = FALSE)
myGSEA.df <- as_tibble(myGSEA.res@result)
# view results as an interactive table
datatable(myGSEA.df,
extensions = c('KeyTable', "FixedHeader"),
caption = 'Signatures enriched in 19606 vs THP1 Mock',
options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 10, lengthMenu = c("10", "25", "50", "100"))) %>%
formatRound(columns = c(3:10), digits = 2)# create enrichment plots using the enrichplot package
gseaplot2(myGSEA.res,
geneSetID = 47, # replace with your specific gene set ID
pvalue_table = FALSE, # can set this to FALSE for a cleaner plot
title = myGSEA.res$Description[47]) # can also turn off this title# Add a variable to this result that matches enrichment direction with phenotype
myGSEA.df <- myGSEA.df %>%
mutate(phenotype = case_when(
NES > 0 ~ "disease",
NES < 0 ~ "healthy"))
# Create 'bubble plot' to summarize y signatures across x phenotypes
ggplot(myGSEA.df[1:20,], aes(x = phenotype, y = ID)) +
geom_point(aes(size = setSize, color = NES, alpha = -log10(p.adjust))) +
scale_color_gradient(low = "blue", high = "red") +
theme_bw()# Plotting the enrichment results
enrich_plot <- dotplot(myGSEA.res, showCategory = 10, title = "GSEA Enrichment Plot")
print(enrich_plot)Describe the results in your own words. Some things to think about:
The output from running ‘sessionInfo’ is shown below and details all packages and version necessary to reproduce the results in this report.
## R version 4.4.0 (2024-04-24)
## Platform: aarch64-apple-darwin20
## Running under: macOS Ventura 13.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] enrichplot_1.24.0 msigdbr_7.5.1 clusterProfiler_4.12.0
## [4] gprofiler2_0.2.3 GSVA_1.52.3 GSEABase_1.66.0
## [7] graph_1.82.0 annotate_1.82.0 XML_3.99-0.17
## [10] AnnotationDbi_1.66.0 IRanges_2.38.1 S4Vectors_0.42.1
## [13] Biobase_2.64.0 BiocGenerics_0.50.0 RColorBrewer_1.1-3
## [16] gplots_3.1.3.1 plotly_4.10.4 gt_0.11.0
## [19] DT_0.33 cowplot_1.1.3 matrixStats_1.3.0
## [22] lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1
## [25] dplyr_1.1.4 purrr_1.0.2 readr_2.1.5
## [28] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1
## [31] tidyverse_2.0.0 edgeR_4.2.0 limma_3.60.3
## [34] knitr_1.48 tinytex_0.51 rmarkdown_2.27
##
## loaded via a namespace (and not attached):
## [1] later_1.3.2 splines_4.4.0
## [3] ggplotify_0.1.2 bitops_1.0-7
## [5] polyclip_1.10-6 lifecycle_1.0.4
## [7] lattice_0.22-6 MASS_7.3-61
## [9] crosstalk_1.2.1 magrittr_2.0.3
## [11] sass_0.4.9 jquerylib_0.1.4
## [13] yaml_2.3.9 httpuv_1.6.15
## [15] DBI_1.2.3 abind_1.4-7
## [17] zlibbioc_1.50.0 GenomicRanges_1.56.1
## [19] RCurl_1.98-1.16 ggraph_2.2.1
## [21] yulab.utils_0.1.4 tweenr_2.0.3
## [23] GenomeInfoDbData_1.2.12 ggrepel_0.9.5
## [25] irlba_2.3.5.1 tidytree_0.4.6
## [27] codetools_0.2-20 DelayedArray_0.30.1
## [29] DOSE_3.30.1 xml2_1.3.6
## [31] ggforce_0.4.2 tidyselect_1.2.1
## [33] aplot_0.2.3 UCSC.utils_1.0.0
## [35] farver_2.1.2 ScaledMatrix_1.12.0
## [37] viridis_0.6.5 jsonlite_1.8.8
## [39] tidygraph_1.3.1 tools_4.4.0
## [41] treeio_1.28.0 Rcpp_1.0.12
## [43] glue_1.7.0 gridExtra_2.3
## [45] SparseArray_1.4.8 xfun_0.45
## [47] qvalue_2.36.0 MatrixGenerics_1.16.0
## [49] GenomeInfoDb_1.40.1 HDF5Array_1.32.0
## [51] withr_3.0.0 fastmap_1.2.0
## [53] rhdf5filters_1.16.0 fansi_1.0.6
## [55] caTools_1.18.2 digest_0.6.36
## [57] rsvd_1.0.5 mime_0.12
## [59] gridGraphics_0.5-1 timechange_0.3.0
## [61] R6_2.5.1 colorspace_2.1-0
## [63] GO.db_3.19.1 gtools_3.9.5
## [65] RSQLite_2.3.7 utf8_1.2.4
## [67] generics_0.1.3 data.table_1.15.4
## [69] graphlayouts_1.1.1 httr_1.4.7
## [71] htmlwidgets_1.6.4 S4Arrays_1.4.1
## [73] scatterpie_0.2.3 pkgconfig_2.0.3
## [75] gtable_0.3.5 blob_1.2.4
## [77] SingleCellExperiment_1.26.0 XVector_0.44.0
## [79] shadowtext_0.1.3 htmltools_0.5.8.1
## [81] fgsea_1.30.0 scales_1.3.0
## [83] png_0.1-8 SpatialExperiment_1.14.0
## [85] ggfun_0.1.5 rstudioapi_0.16.0
## [87] tzdb_0.4.0 reshape2_1.4.4
## [89] rjson_0.2.21 nlme_3.1-165
## [91] cachem_1.1.0 rhdf5_2.48.0
## [93] KernSmooth_2.23-24 parallel_4.4.0
## [95] HDO.db_0.99.1 pillar_1.9.0
## [97] grid_4.4.0 vctrs_0.6.5
## [99] promises_1.3.0 BiocSingular_1.20.0
## [101] beachmat_2.20.0 xtable_1.8-4
## [103] evaluate_0.24.0 magick_2.8.3
## [105] cli_3.6.3 locfit_1.5-9.10
## [107] compiler_4.4.0 rlang_1.1.4
## [109] crayon_1.5.3 labeling_0.4.3
## [111] plyr_1.8.9 fs_1.6.4
## [113] stringi_1.8.4 viridisLite_0.4.2
## [115] BiocParallel_1.38.0 babelgene_22.9
## [117] munsell_0.5.1 Biostrings_2.72.1
## [119] lazyeval_0.2.2 GOSemSim_2.30.0
## [121] Matrix_1.7-0 patchwork_1.2.0
## [123] hms_1.1.3 sparseMatrixStats_1.16.0
## [125] bit64_4.0.5 Rhdf5lib_1.26.0
## [127] shiny_1.8.1.1 KEGGREST_1.44.1
## [129] statmod_1.5.0 SummarizedExperiment_1.34.0
## [131] highr_0.11 igraph_2.0.3
## [133] memoise_2.0.1 bslib_0.7.0
## [135] ggtree_3.12.0 fastmatch_1.1-5
## [137] bit_4.0.5 gson_0.1.0
## [139] ape_5.8